In [ ]:
%matplotlib inline
import matplotlib.pyplot as plt

In [ ]:
import sys, os, re, time
import urllib

import numpy as np

from IPython import parallel

Downloading from flickr

This flickr parsing code is adapted from here


In [ ]:
def extract_urls(html):
    """Extract flickr image URLs from the HTML of a page.

    Parameters
    ----------
    html : str
        Page source to scan.

    Returns
    -------
    list of str
        Every URL found inside a ``src="..."`` attribute that points at a
        ``farmN.static.flickr.com`` / ``farmN.staticflickr.com`` JPEG, in
        document order. Empty when nothing matches.
    """
    # Dots are escaped so they match a literal '.' only; left bare they match
    # any character and accept look-alike hosts and names not ending in .jpg.
    re_imageurl = re.compile(
        r'src="(http://farm\d+\.static\.?flickr\.com/\d+/\d+_\w+\.jpg)"',
        re.IGNORECASE,
    )
    # findall already returns [] when there is no match.
    return re_imageurl.findall(html)

In [ ]:
def urls_for_tag(tag='face', min_images=100, max_pages=20):
    """Get urls to flickr images with the given tag(s).

    Scrapes successive pages of the flickr search results until at least
    ``min_images`` urls have been collected, ``max_pages`` pages have been
    requested, or a page yields no image urls.

    Parameters
    ----------
    tag : str
        Search term; it is URL-quoted before being placed in the query string.
    min_images : int
        Stop once this many urls have been collected. The result may hold
        more, because whole pages are appended at a time.
    max_pages : int
        Upper bound on the number of result pages requested.

    Returns
    -------
    list of str
        The image urls found, in the order the pages returned them.
    """
    # Quote the term so spaces, '&', '#', ... cannot corrupt the query string.
    # '%s' formatting first keeps non-string tags working as they did before.
    query = urllib.quote_plus('%s' % (tag,))
    urls = []
    page = 1
    while len(urls) < min_images and page <= max_pages:
        url = 'http://www.flickr.com/search/?q=%s&l=cc&ss=0&ct=0&mt=photos&w=all&adv=1&m=tags&page=%i' % (query, page)
        print("fetching %s" % url)
        urlfile = urllib.urlopen(url)
        try:
            html = urlfile.read()
        finally:
            # release the connection even if the read fails
            urlfile.close()
        page_urls = extract_urls(html)
        urls.extend(page_urls)
        print("found %i images" % len(urls))
        if not page_urls:
            # an empty page means further pages will not help either
            print("no new images")
            break
        page += 1

    return urls

In [ ]:
# Collect at least 500 'portrait' image urls (or as many as the page limit allows).
urls = urls_for_tag(tag='portrait', min_images=500)

In [ ]:
def download_image(url, dest_dir='images'):
    """Download an image from a url into a directory.

    The file name is the last path component of ``url``. If a file with that
    name already exists in ``dest_dir`` nothing is downloaded.

    Parameters
    ----------
    url : str
        Location of the image.
    dest_dir : str
        Directory to save into; created if it does not exist.

    Returns
    -------
    str
        The path to the downloaded (or already present) image.
    """
    # Imported locally so the function carries its own dependencies when it
    # is pushed to and executed on a remote engine.
    import os
    import urllib
    basename = url.rsplit('/', 1)[-1]
    dest = os.path.join(dest_dir, basename)
    if not os.path.exists(dest_dir):
        try:
            os.makedirs(dest_dir)
        except OSError:
            # Another worker may have created the directory between the
            # check and the call; only a genuine failure is re-raised.
            if not os.path.isdir(dest_dir):
                raise
    if os.path.exists(dest):
        print("already have %s" % dest)
        return dest

    print("downloading %s -> %s" % (url, dest))
    urlf = urllib.urlopen(url)
    try:
        data = urlf.read()
    finally:
        # close the connection even if the read fails
        urlf.close()
    # Binary mode: text mode would translate line endings on some platforms
    # and corrupt the image data.
    with open(dest, 'wb') as f:
        f.write(data)
    return dest

First, initialize OpenCV for simple facial detection


In [ ]:
import cv

# Cascade definition file, given as a bare file name (so it is resolved
# against the current working directory).
# if you have opencv installed via homebrew, this would be in
# /usr/local/share/OpenCV/haarcascades/
HAAR_CASCADE_PATH = "haarcascade_frontalface_default.xml"

# Module-level detector state; detect_faces reads both of these globals.
cascade = cv.Load(HAAR_CASCADE_PATH)
storage = cv.CreateMemStorage()

Then define a few functions for extracting faces from images


In [ ]:
def extract_faces(image, faces):
    """Crop each face rectangle out of an image.

    image: object exposing tostring(), height, width and nChannels
    faces: iterable of (x, y, w, h) rectangles
    returns a list with one numpy array per rectangle; the channel axis is
    reversed relative to the raw image buffer
    """
    import numpy as np
    shape = (image.height, image.width, image.nChannels)
    pixels = np.frombuffer(image.tostring(), dtype=np.uint8).reshape(shape)
    # Reverse the last axis; presumably BGR -> RGB so matplotlib shows the
    # right colours (TODO confirm the source channel order).
    pixels = pixels[:, :, ::-1]
    return [
        pixels[face[1]:face[1] + face[3], face[0]:face[0] + face[2]]
        for face in faces
    ]


def detect_faces(filename):
    """Load an image file with OpenCV and run the Haar face detector on it.

    Relies on the module-level ``cascade`` and ``storage`` objects.

    returns None if no faces are detected,
    (filename, [list of numpy arrays]) if there are faces
    """
    image = cv.LoadImage(filename)
    # Positional arguments after storage are, per the legacy cv API, the scale
    # factor (1.2), the minimum neighbours (2), the flags and the minimum
    # window size (100x100).
    detected = cv.HaarDetectObjects(image, cascade, storage, 1.2, 2, cv.CV_HAAR_DO_CANNY_PRUNING, (100,100))
    # Each hit is ((x, y, w, h), n); only the rectangle is kept.
    rects = [(x, y, w, h) for (x, y, w, h), _ in detected] if detected else []
    if not rects:
        return None
    return filename, extract_faces(image, rects)

And finally, a two-step function that downloads an image from a url, and detects faces in it.


In [ ]:
def faces_in_url(url):
    """Download the image at `url`, then run face detection on the saved file.

    Returns whatever detect_faces returns for the downloaded path.
    """
    return detect_faces(download_image(url))

If the network doesn't work, you can just generate a list of paths to images on your computer. For instance, these pictures are just everything from my iPhoto thumbnails directory, so they vary in size from roughly 320x240 to 1024x768.


In [ ]:
import glob
library = os.path.expanduser("~/Pictures/2013.iphotolibrary")
# Every .jpg found anywhere below the library's Thumbnails folder.
pictures = [
    os.path.join(directory, fname)
    for directory, subdirs, files in os.walk(os.path.join(library, 'Thumbnails'))
    for fname in files
    if fname.endswith('.jpg')
]

Or this one, which globs pictures from a particular folder:


In [ ]:
import glob
# .jpg files exactly one directory level below images/
picture_glob = "images/*/*.jpg"
pictures = glob.glob(picture_glob)

Let's test our face detection on the first downloaded image that contains a face:


In [ ]:
# Walk the urls until one of them yields at least one face.
found = None  # stays None when urls is empty or nothing is detected
for url in urls:
    found = faces_in_url(url)
    if found:
        break

if found:
    filename, faces = found
    for face in faces:
        plt.figure()
        plt.imshow(face)
else:
    # Unpacking None would raise; report the empty result instead.
    print("no faces found in any of the %i urls" % len(urls))

If the network isn't kind to you, we can skip the downloads, and just use pictures we have on the filesystem:


In [ ]:
# Walk the local pictures until one of them yields at least one face.
found = None  # stays None when pictures is empty or nothing is detected
for p in pictures:
    found = detect_faces(p)
    if found:
        break

if found:
    filename, faces = found
    for face in faces:
        plt.figure()
        plt.imshow(face)
else:
    # Unpacking None would raise; report the empty result instead.
    print("no faces found in any of the %i pictures" % len(pictures))

Hey, that looks like a face!

Now in parallel

First, we connect our parallel Client


In [ ]:
rc = parallel.Client()
# Load-balanced view: used below to map the detection tasks.
view = rc.load_balanced_view()
# Slicing the client with [:] gives a view over all engines; used below to
# push definitions to each of them.
all_engines = rc[:]

Then we initialize OpenCV on all of the engines (identical to what we did above)


In [ ]:
%%px
# Change each engine's working directory. The path is relative, so it depends
# on where the engines were started. Presumably this is so the bare cascade
# file name loaded in the next cell resolves on the engines; verify locally.
%cd notebooks/parallel

In [ ]:
%%px
HAAR_CASCADE_PATH = "haarcascade_frontalface_default.xml"

import os, urllib
import cv
storage = cv.CreateMemStorage()
cascade = cv.Load(HAAR_CASCADE_PATH)

and make sure extract_faces, detect_faces, and download_image are defined everywhere


In [ ]:
# Send the helper functions defined in this notebook to every engine.
all_engines.push({
    'extract_faces': extract_faces,
    'detect_faces': detect_faces,
    'download_image': download_image,
})

Now we can iterate through all of our pictures, and detect and display any faces we find


In [ ]:
tic = time.time()
# if you are running offline, do this one:
# f = detect_faces
# source = pictures

# or you can download each image as part of the task:
f = faces_in_url
source = urls


# ordered=False: per the IPython.parallel docs, results are yielded as they
# arrive rather than in submission order.
amr = view.map_async(f, source[:1000], ordered=False)
nfound = 0
# Tasks that found nothing return a falsy result and are skipped.
for filename, faces in (r for r in amr if r):
    nfound += len(faces)
    print("%i faces found in %s" % (len(faces), filename))
    for face in faces:
        plt.imshow(face)
        plt.show()

toc = time.time()

print("found %i faces in %i images in %f s" % (nfound, len(amr), toc-tic))